%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%% Topology-Preserving Selection and Clustering of Multi-Dimensional Biological Data %%%%%%%%%%%

% We assume you have installed MATLAB version 6.5 (R13) or later on one of
% three operating systems (i.e., Windows OS, Linux/Unix, or Mac OS)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%                                         NOTE                                             %%%%%%%%
% Before DEMO, please run the following scripts at the command line so as to add specified directories  %
% (i.e., m-files for functions and .mat for microarray data) to the current MATLAB search path.          %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% main working directory
maindir=pwd;

% add directories to MATLAB search path
% fullfile joins the parts with the file separator of the running platform, so
% the same three calls serve Windows OS, Linux/Unix and Mac OS alike (no
% "not a directory" warnings from paths written for the other platform)
addpath(fullfile(maindir,'somtoolbox'))
addpath(fullfile(maindir,'somsvd'))
addpath(fullfile(maindir,'microarray_db'))

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%                                                                                          %%%%%%%%
%%%%%%%%                                         DEMO                                             %%%%%%%%
%%%%%%%%                     Illustration of the proposed methodology                             %%%%%%%%
%%%%%%%% by processing genome-scale expression data of human cells during various stress responses %%%%%%%
%%%%%%%%                                                                                          %%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Data: primary gene expression matrix (36,164 genes X 76 samples)
%% obtained after preliminary clean-up procedures: transformation relative to the zero time point of each 
%% time course by subtracting the median log2 expression ratios in the time-zero replicates, 
%% exclusion of those with more than 40% missing values, and finally exclusion of the samples at zero time 
%% points from our follow-up data. Details of stress conditions are as follows
%% Heat Shock (HS)
%%% Heat shock HeLa 0.5 1 2 3 4 6 8 24h (HS_H)
%%% Heat shock Fibroblast 0.5 1 2 4 10 16 24h (HS_F)
%%% Heat shock K562 1 2 3 4 6 8 24h (HS_K)
%% ER Stress (ER)
%%% ER stress HeLa Tunicamycin 0.5 1 2 4 6 8 12 24 36h (ER_HT)
%%% ER stress HeLa DTT 0.5 1 2 4 6 8 16 24 30h (ER_HD)
%%% ER stress Fibroblast DTT 0.5 1 2 3 4 6 8 12 16 24 36h (ER_FD)
%% Oxidative Stress (OS)
%%% Oxidative stress HeLa Hydrogen peroxide 0.5 1 2 6 8 16 24 30h (OS_HH)
%%% Oxidative stress HeLa Menadione 0.5 1 2 4 8 12 24 32h (OS_HM)
%%% Oxidative stress Fibroblast Menadione 0.5 1 3 4 6 8 12 24 36h (OS_FM)

% Bring the primary gene expression data and the sample names into the workspace.
% The .mat file is expected to provide:
%% D_STRESS: gene expression matrix (36,164 X 76); missing values are denoted as NaN
%% cnames:   name of each microarray sample (here 1 X 76)
load('Human_cell_stress_data_Murray_et_al_2004_MBC.mat')

% notes: Since large microarray data (number of genes/rows in D_STRESS) are to be processed, training
%        may take several minutes, or even half an hour or more, depending on your computing system.
%        Just for demonstration, you can restrict the analysis to the first 10,000 genes of D_STRESS.
%% To do so, set use_demo_subset=true in the workspace before running this section (it still takes
%% a little while to complete). If use_demo_subset is unset or false, the whole dataset is used
%% (maybe you need a cup of tea or coffee); this is also what happens when the file is run as a script.

% keep a value already chosen by the user; otherwise default to the whole dataset
if ~exist('use_demo_subset','var')
    use_demo_subset=false;
end

if use_demo_subset
    % first 10,000 genes only; min() keeps the index valid for smaller matrices
    demoD_STRESS=D_STRESS(1:min(10000,size(D_STRESS,1)),:);
else
    % whole dataset
    demoD_STRESS=D_STRESS;
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%% Data pre-processing using SOM with Epanechnikov (EP) kernel function %%%%%%%%%%%%%%%%

% Step 1: Non-linear transformation of the primary gene expression data (gene expression matrix as
%         input) into an output codebook matrix, using SOM with the EP kernel function

%% Wrap the expression matrix in a data structure for SOM training; the sample names in cnames
%% are passed as the component names
sD = som_data_struct(demoD_STRESS, 'comp_names', cnames);

%% Training with the 'ep' neighbourhood. The input data are processed sequentially: determination of
%% the number of map units and the map size, linear initialization, one rough batch training epoch,
%% and repeated fine-tuning epochs until the two measures of SOM quality (average quantization error
%% and topographic error) no longer change.
%% note: 'som_make_modified' is the modified version of 'som_make' from the directory 'somtoolbox'
%%% A real-time figure shows up to trace the training process: red crosses represent the input data,
%%% while the SOM map being trained is drawn as solid dots joined by gray lines. To facilitate
%%% human-centered visualization, the input data and the map are projected onto 2-dimensional space.
%%% Use your imagination to think over what is happening in the high-dimensional hyperspace
sMap_ep = som_make_modified(sD, 'neigh', 'ep');


%% Component plane presentations (CPPs) of sMap_ep
cppsom_stress(sMap_ep, cnames, 1); % 1 for maximum values of color limits
%% notes: the m-file/function 'cppsom_stress' is only suitable for visualizing the Human stress data.
%%        For microarray data with different samples, modify the function to visualize the data
%%        as desired.


%% Collect the results of training
[Dunit, Mhits] = Data_output(sMap_ep, sD);
%%% Layout of Dunit:  column 1       - original order of the input data
%%%                   column 2       - Best Matching Unit the row belongs to
%%%                   columns 3..end - original input data
%%% Layout of Mhits:  column 1       - neuron ID
%%%                   column 2       - hit histogram
%%%                   columns 3..end - codebook matrix

%% Write the formatted data to files with the .xls extension
%% flag=0: standard dialog box for saving the files under user-defined filenames
%% flag=1: 'Dunit.xls' and 'Mhits.xls' stored in the main working directory (by default)
flag = 1;
somsvd_write(Dunit, Mhits, cnames, flag)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Hybrid SOM-SVD for gene selection %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Step 2:	Linear decomposition of the resulting codebook matrix by SVD

%% D: the codebook matrix created by SOM with the EP kernel function
D = sMap_ep.codebook;
%% economy-size SVD of the codebook matrix (second argument 0)
[U, S, V] = svd(D, 0);
%% Graphic depiction of the Singular Value Decomposition as a heatmap
svd_heatmap(D)


% Step 3:	Calculation of a distance statistic for each neuron,
%           following dominant eigenvector selection and SVD subspace projection
% Step 4:   FDR multiple comparison procedure for assessing significant neurons
%           and the subsequent gene selection

%% Bar chart for S, used to estimate the number of significant eigenvectors
S_barchart(D)
%% Dominant eigenvectors are those whose observed relative eigenexpression lies beyond the maximum
%% random relative eigenexpression (or, further constrained, at a probability of at least 99%,
%% corresponding to p-value<0.01).
%% Here the first eight dominant eigenvectors are chosen, capturing essential patterns inherent in the data
neigenvectors = 8;

%% Pair-wise SVD projection scatter plots
svd_scatterplots(D, V, neigenvectors)

%% Assess the statistical significance of each neuron in terms of FDR
B = 1000; % the number of randomized reference datasets
fdr = svd_fdr(D, V, neigenvectors, B); % it may take a while. Just enjoy another cup of tea or coffee

%% The number of neurons and genes selected under the indicated FDR threshold
hits = Mhits(:, 2);
fdr_threshold(fdr, hits)
%%% Note: an obvious change of the sharp slope on either the neuron line or the gene line occurs
%%% at the FDR of 0.01, implying that this FDR cutoff may retain the most information while
%%% capturing the least noise, compared with other cutoffs in this data setting
%%% (no trailing semicolon below: the chosen value is echoed in the command window)
cutoff = 0.01

%% Visualize the nodes selected under the indicated FDR using SOM
fdr_som(sMap_ep, fdr, cutoff)

%% Select significant nodes under the given tolerable FDR level, and subsequently their corresponding genes
%%% nodes selected under indicated FDR cutoff (strictly below the cutoff)
nodes_selected=Mhits(fdr<cutoff,1);
%%% flag an empty selection here, where the cause is obvious, rather than letting it
%%% surface later in the clustering steps that take Dunit_selected as their input
if isempty(nodes_selected)
    warning(sprintf('No node has an FDR below the cutoff of %g; Dunit_selected will be empty.',cutoff));
end
%%% genes selected under indicated FDR cutoff: rows of Dunit whose node index
%%% (2nd column) is one of the selected nodes
Dunit_selected=Dunit(ismember(Dunit(:,2),nodes_selected),:); 
%%%%% For Dunit_selected, 1st column:        original input ID; 
%%%%%                     2nd column:        node index;
%%%%%                     The rest columns:  selected gene expression matrix

%%% remove the intermediate variables of the gene selection part from the workspace
clear D U S V hits

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%% Two-phase gene clustering of selected genes through SOM-based methods %%%%%%%%%%%%%%%%%%%%

% Step 5:	Two-phase gene clustering of the selected genes through SOM-based methods:
%           training of the selected gene expression matrix by SOM with the Gaussian kernel
%           function (1st phase), then distance matrix based clustering of the SOM (2nd phase).

%% D: the selected gene expression matrix (columns 3..end of Dunit_selected)
D = Dunit_selected(:, 3:end);
%% ID: original order of the selected rows (column 1 of Dunit_selected)
ID = Dunit_selected(:, 1);

%% First phase: the input data are trained by SOM with the Gaussian kernel to better preserve
%% the topology of the data
sD_selected = som_data_struct(D, 'comp_names', cnames);
%%% Another real-time figure shows up to trace the training process.
%%% Compared to the previous SOM map with the Epanechnikov kernel, this SOM map
%%% trained with the Gaussian kernel is globally smoother. That is why we
%%% propose the Epanechnikov kernel for tasks such as topology-preserving gene selection
%%% and the Gaussian kernel for global gene clustering.
sMap = som_make_modified(sD_selected, 'neigh', 'gaussian');

%% CPP-SOM of sMap
cppsom_stress(sMap, cnames, 1); % 1 for maximum values of color limits

%% Second phase: the trained map is divided into a set of bases (clusters) by a region growing
%% procedure that starts with the local minima of the distance matrix as seeds and then assigns
%% the remaining nodes to the corresponding base
[Dunit_base, Mhits_base] = som_bases(sMap, sD_selected, ID);
%%% Layout of Dunit_base:  column 1       - original order of the input data (sD.data)
%%%                        column 2       - Best Matching Unit the row belongs to
%%%                        column 3       - base the row belongs to
%%%                        columns 4..end - selected gene expression matrix
%%% Layout of Mhits_base:  column 1       - node ID
%%%                        column 2       - hit histogram
%%%                        column 3       - seed flag, 1 for Yes, 0 for No
%%%                        column 4       - base
%%%                        columns 5..end - codebook matrix

%% Write the formatted data to files with the .xls extension
%% flag=0: standard dialog box for saving the files under user-defined filenames
%% flag=1: 'Dunit_base.xls' and 'Mhits_base.xls' stored in the main working directory (by default)
flag = 1;
somsvd_write(Dunit_base, Mhits_base, cnames, flag)

%% Optionally, you can run the following scripts for 
%% hierarchical displays illustrating patterns of gene expression within each base/cluster

%%% Save information
outdir='Output_Hierarchical displays of each base';fmt='.tif';
%%% saveinfo{1}: directory for pictures stored; saveinfo{2}: format used to save the figures.
%%% The cell array is built in one assignment so that a 'saveinfo' variable left in the
%%% workspace by an earlier session cannot contribute stale or mistyped entries
saveinfo={outdir,fmt};
%%% create the corresponding directory to store each set of figures, but only when it is
%%% missing: mkdir warns about an existing directory, which happens on every re-run of the demo.
%%% The check uses the full path under the current directory, where mkdir creates outdir
if ~exist(fullfile(pwd,outdir),'dir')
    mkdir(outdir)
end
%%% Hierarchical displays illustrating patterns of gene expression within each base/cluster
Hcluster_bases(Dunit_base,saveinfo)
%%%% notes: % m-file/function 'Hcluster_bases' is just suitable for Human cell stress data

clear D flag outdir fmt

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Step 6: 	Potential utility of obtained clusters (bases) for downstream analyses, 
%           e.g. functional enrichment analysis, or search for TFBSs enrichments 
% Notes: This part is omitted because it depends on the external databases used. 
%	 However, the details are available upon request		